### Dan Hopkins
### 12.11.2020
### ACA attitudes

### Goal: merge KFF geo-coded data with
### exchange pricing data

library(xlsx)
library(dplyr)

### load geo-merged KFF data
### This loads geo-coded KFF surveys; they must be obtained separately
### Code to produce merged/cleaned geo-coded surveys:
### from: kff-loadall-geomerge-ACTIVE-12192018.R
###
load("kff-geocodes-merge-12212018.Rdata")

### load exchange market - county crosswalk
exx <- read.csv("data/exchangedata/Working ACAGeography Counties & Ratings Area12202018.csv")
exx$STCOFIPS <- as.numeric(exx$AL*1000+exx$FIPS.CODE)

exx.sub <- subset(exx,select=c("STATE","STCOFIPS","COUNTY.NAME","Rating.Area.ID","Rating.Area.Number","RatingAreaName"))

dta14 <- read.csv("data/exchangedata/summary_ratingarea_2014.csv")
length(unique(dta14$area))
idx <- which(colnames(dta14)=="area")
colnames(dta14)[idx] <- "RatingAreaName"
dim(dta14)
sum(dta14$RatingAreaName %in% exx.sub$RatingAreaName)
dta14$RatingAreaName[which(! dta14$RatingAreaName %in% exx.sub$RatingAreaName)]
cn2 <- cn <- colnames(dta14)
n <- length(cn2)
cn2[3:n] <- paste(cn[3:n],".14",sep="")
length(cn2)==length(cn)
colnames(dta14) <- cn2

dta15 <- read.csv("data/exchangedata/summary_ratingarea_2015.csv")
length(unique(dta15$area))
idx <- which(colnames(dta15)=="area")
colnames(dta15)[idx] <- "RatingAreaName"
dim(dta15)
sum(dta15$RatingAreaName %in% exx.sub$RatingAreaName)
dta15$RatingAreaName[which(! dta15$RatingAreaName %in% exx.sub$RatingAreaName)]
cn2 <- cn <- colnames(dta15)
n <- length(cn2)
cn2[3:n] <- paste(cn[3:n],".15",sep="")
length(cn2)==length(cn)
colnames(dta15) <- cn2

dta16 <- read.csv("data/exchangedata/summary_ratingarea_2016.csv")
length(unique(dta16$area))
idx <- which(colnames(dta16)=="area")
colnames(dta16)[idx] <- "RatingAreaName"
dim(dta16)
sum(dta16$RatingAreaName %in% exx.sub$RatingAreaName)
dta16$RatingAreaName[which(! dta16$RatingAreaName %in% exx.sub$RatingAreaName)]
cn2 <- cn <- colnames(dta16)
n <- length(cn2)
cn2[3:n] <- paste(cn[3:n],".16",sep="")
length(cn2)==length(cn)
colnames(dta16) <- cn2

dta17 <- read.csv("data/exchangedata/summary_ratingarea_2017.csv")
length(unique(dta17$area))
idx <- which(colnames(dta17)=="area")
colnames(dta17)[idx] <- "RatingAreaName"
dim(dta17)
sum(dta17$RatingAreaName %in% exx.sub$RatingAreaName)
dta17$RatingAreaName[which(! dta17$RatingAreaName %in% exx.sub$RatingAreaName)]
cn2 <- cn <- colnames(dta17)
n <- length(cn2)
cn2[3:n] <- paste(cn[3:n],".17",sep="")
length(cn2)==length(cn)
colnames(dta17) <- cn2

dta14$st <- as.character(dta14$st)
dta15$st <- as.character(dta15$st)
dta16$st <- as.character(dta16$st)
dta17$st <- as.character(dta17$st)

dta14$RatingAreaName <- as.character(dta14$RatingAreaName)
dta15$RatingAreaName <- as.character(dta15$RatingAreaName)
dta16$RatingAreaName <- as.character(dta16$RatingAreaName)
dta17$RatingAreaName <- as.character(dta17$RatingAreaName)

dta1415 <- full_join(dta14,dta15)
dta141516 <- full_join(dta1415,dta16)
dta14151617 <- full_join(dta141516,dta17)

dta14$RatingAreaName[! dta14$RatingAreaName %in% dta15$RatingAreaName]
dta15$RatingAreaName[! dta15$RatingAreaName %in% dta14$RatingAreaName]

exx.sub$RatingAreaName <- as.character(exx.sub$RatingAreaName)
exx.sub2 <- exx.sub[! exx.sub$STCOFIPS %in% c(NA),]
dim(exx.sub)
dim(exx.sub2)

##### load county Census demos
CO16A <- read.table("data/censusdata/SocialExplorer-2016ACS-5y-CountyData-R11818880_SL050.txt",header=T)
colnames(CO16A)[1] <- "STCOFIPS"
CO16B <- read.table("data/censusdata/SocialExplorer-2016ACS-5y-CountyMobility-R11818891_SL050.txt",header=T)
colnames(CO16B)[1] <- "STCOFIPS"
CO16 <- inner_join(CO16A,CO16B)
cn2 <- cn <- colnames(CO16)
cn2[56:193] <- paste(cn[56:193],".16",sep="")
length(cn2)==length(cn)
colnames(CO16) <- cn2

CO10 <- read.table("data/censusdata/SocialExplorer-2010ACS-5y-CountyData-R11818885_SL050.txt",header=T)
colnames(CO10)[1] <- "STCOFIPS"
cn2 <- cn <- colnames(CO10)
cn2[56:189] <- paste(cn[56:189],".10",sep="")
length(cn2)==length(cn)
colnames(CO10) <- cn2

CO00 <- read.table("data/censusdata/SocialExplorer-2000Census-2010Geographies-CountyData-R11818901_SL050.txt",header=T)
colnames(CO00)[1] <- "STCOFIPS"
cn2 <- cn <- colnames(CO00)
cn2[8:205] <- paste(cn[8:205],".00",sep="")
length(cn2)==length(cn)
colnames(CO00) <- cn2

### merge 2000, 2010
CO0010 <- inner_join(CO00,CO10,by="STCOFIPS")

### merge 2000-2010, 2016
COALL <- inner_join(CO0010,CO16,by="STCOFIPS")

#### recode county-level
#### demographics

COALL$COPCNHWH00 <- COALL$SE_T013_003.00/COALL$SE_T001_001.00
COALL$COPCNHWH10 <- COALL$SE_T014_003.10/COALL$SE_T001_001.10
COALL$COPCNHWH16 <- COALL$SE_T014_003.16/COALL$SE_T001_001.16

COALL$COPCNHBL00 <- COALL$SE_T013_004.00/COALL$SE_T013_001.00
COALL$COPCNHBL10 <- COALL$SE_T014_004.10/COALL$SE_T001_001.10
COALL$COPCNHBL16 <- COALL$SE_T014_004.16/COALL$SE_T001_001.16

COALL$COPCNHAS00 <- COALL$SE_T013_006.00/COALL$SE_T013_001.00
COALL$COPCNHAS10 <- COALL$SE_T014_006.10/COALL$SE_T001_001.10
COALL$COPCNHAS16 <- COALL$SE_T014_006.16/COALL$SE_T001_001.16

COALL$COPCHIS00 <- COALL$SE_T013_010.00/COALL$SE_T013_001.00
COALL$COPCHIS10 <- COALL$SE_T014_010.10/COALL$SE_T001_001.10
COALL$COPCHIS16 <- COALL$SE_T014_010.16/COALL$SE_T001_001.16

COALL$COPCPOOR00 <- COALL$SE_T181_002.00/COALL$SE_T181_001.00
COALL$COPCPOOR10 <- COALL$SE_T115_002.10/COALL$SE_T115_001.10
COALL$COPCPOOR16 <- COALL$SE_T115_002.16/COALL$SE_T115_001.16

### foreign born non-citizen

COALL$COPCFBNC00 <- COALL$SE_T204_005.00/COALL$SE_T204_001.00
COALL$COPCFBNC10 <- COALL$SE_T133_005.10/COALL$SE_T133_001.10
COALL$COPCFBNC16 <- COALL$SE_T133_005.16/COALL$SE_T133_001.16

### foreign born citizens
COALL$COPCFBCZ00 <- COALL$SE_T204_004.00/COALL$SE_T204_001.00
COALL$COPCFBCZ10 <- COALL$SE_T133_004.10/COALL$SE_T133_001.10
COALL$COPCFBCZ16 <- COALL$SE_T133_004.16/COALL$SE_T133_001.16

###
COALL$COMHINC00 <- COALL$SE_T091_001.00
COALL$COMHINC10 <- COALL$SE_T057_001.10
COALL$COMHINC16 <- COALL$SE_T057_001.16

### education

COALL$COPCBCH00 <- (COALL$SE_T038_005.00+COALL$SE_T038_006.00+COALL$SE_T038_007.00+COALL$SE_T038_008.00)/COALL$SE_T038_001.00
COALL$COPCBCH10 <- (COALL$SE_T025_005.10+COALL$SE_T025_006.10+COALL$SE_T025_007.10+COALL$SE_T025_008.10)/COALL$SE_T025_001.10
COALL$COPCBCH16 <- (COALL$SE_T025_005.16+COALL$SE_T025_006.16+COALL$SE_T025_007.16+COALL$SE_T025_008.16)/COALL$SE_T025_001.16

COALL$COPCHSL00 <- (COALL$SE_T038_002.00+COALL$SE_T038_003.00)/COALL$SE_T038_001.00
COALL$COPCHSL10 <- (COALL$SE_T025_002.10+COALL$SE_T025_003.10)/COALL$SE_T025_001.10
COALL$COPCHSL16 <- (COALL$SE_T025_002.16+COALL$SE_T025_003.16)/COALL$SE_T025_001.16

COALL$COPCSOCL00 <- (COALL$SE_T038_004.00)/COALL$SE_T038_001.00
COALL$COPCSOCL10 <- (COALL$SE_T025_004.10)/COALL$SE_T025_001.10
COALL$COPCSOCL16 <- (COALL$SE_T025_004.16)/COALL$SE_T025_001.16


#### labor market status
### employed
COALL$COPCEMP00 <- (COALL$SE_T067_005.00+COALL$SE_T067_003.00)/COALL$SE_T067_001.00
COALL$COPCEMP10 <- (COALL$SE_T033_003.10+COALL$SE_T033_005.10)/COALL$SE_T033_001.10
COALL$COPCEMP16 <- (COALL$SE_T033_003.16+COALL$SE_T033_005.16)/COALL$SE_T033_001.16

### unemployed
COALL$COPCUNEMP00 <- (COALL$SE_T067_006.00)/COALL$SE_T067_001.00
COALL$COPCUNEMP10 <- (COALL$SE_T033_006.10)/COALL$SE_T033_001.10
COALL$COPCUNEMP16 <- (COALL$SE_T033_006.16)/COALL$SE_T033_001.16

### not in labor force
COALL$COPCNOLABFOR00 <- (COALL$SE_T067_007.00)/COALL$SE_T067_001.00
COALL$COPCNOLABFOR10 <- COALL$SE_T033_007.10/COALL$SE_T033_001.10
COALL$COPCNOLABFOR16 <- COALL$SE_T033_007.16/COALL$SE_T033_001.16

### pct. construction

COALL$COPCCONST00 <- COALL$SE_T083_005.00/COALL$SE_T083_001.00
COALL$COPCCONST10 <- COALL$SE_T049_003.10/COALL$SE_T049_001.10
COALL$COPCCONST16 <- COALL$SE_T049_003.16/COALL$SE_T049_001.16

### pct. manufacturing
COALL$COPCMANUF00 <- COALL$SE_T083_006.00/COALL$SE_T083_001.00
COALL$COPCMANUF10 <- COALL$SE_T049_004.10/COALL$SE_T049_001.10
COALL$COPCMANUF16 <- COALL$SE_T049_004.16/COALL$SE_T049_001.16

### pct. public sector
COALL$COPCPUBSEC00 <- COALL$SE_T083_027.00/COALL$SE_T083_001.00
COALL$COPCPUBSEC10 <- COALL$SE_T049_014.10/COALL$SE_T049_001.10
### this gives total employed
COALL$COPCPUBSEC16 <- COALL$SE_T049_014.16/COALL$SE_T049_001.16

COALL$COPOPDNS00 <- COALL$SE_T002_002.00
COALL$COPOPDNS10 <- COALL$SE_T002_002.10
COALL$COPOPDNS16 <- COALL$SE_T002_002.16

COALL$COTOTPOP00 <- COALL$SE_T001_001.00
COALL$COTOTPOP10 <- COALL$SE_T001_001.10
COALL$COTOTPOP16 <- COALL$SE_T001_001.16

### percent with social security income
COALL$COPCSSINC00 <- COALL$SE_T140_002.00/COALL$SE_T140_001.00
COALL$COPCSSINC10 <- COALL$SE_T078_002.10/COALL$SE_T080_001.10
COALL$COPCSSINC16 <- COALL$SE_T078_002.16/COALL$SE_T080_001.16

### percent with public assistance income
COALL$COPCPAINC00 <- COALL$SE_T142_002.00/COALL$SE_T142_001.00
COALL$COPCPAINC10 <- COALL$SE_T080_002.10/COALL$SE_T080_001.10
COALL$COPCPAINC16 <- COALL$SE_T080_002.16/COALL$SE_T080_001.16

###

COALL$COPCUND1800 <- (COALL$SE_T007_002.00+COALL$SE_T007_003.00+COALL$SE_T007_004.00+COALL$SE_T007_005.00)/COALL$SE_T007_001.00
COALL$COPCUND1810 <- (COALL$SE_T007_002.10+COALL$SE_T007_003.10+COALL$SE_T007_004.10+COALL$SE_T007_005.10)/COALL$SE_T007_001.10
COALL$COPCUND1816 <- (COALL$SE_T007_002.16+COALL$SE_T007_003.16+COALL$SE_T007_004.16+COALL$SE_T007_005.16)/COALL$SE_T007_001.16

COALL$COPCOVER6500 <- (COALL$SE_T007_011.00+COALL$SE_T007_012.00+COALL$SE_T007_013.00)/COALL$SE_T007_001.00
COALL$COPCOVER6510 <- (COALL$SE_T007_011.10+COALL$SE_T007_012.10+COALL$SE_T007_013.10)/COALL$SE_T007_001.10
COALL$COPCOVER6516 <- (COALL$SE_T007_011.16+COALL$SE_T007_012.16+COALL$SE_T007_013.16)/COALL$SE_T007_001.16

COALL$COPCSACO00 <- (COALL$SE_T199_002.00+COALL$SE_T199_005.00)/COALL$SE_T199_001.00

### merge exchange xwalk, annual exchange data
ctymkt <- left_join(exx.sub2,dta14151617)
dim(ctymkt)

### merge exchange data, survey respondents
gdta.mkt <- left_join(gdta.m,ctymkt,by="STCOFIPS")
sum(gdta.m$STCOFIPS %in% ctymkt$STCOFIPS)
#[1] 84583

gdta.co <- left_join(gdta.mkt,COALL,by="STCOFIPS")

#### fix Los Angeles
gdta.co$ZIP[gdta.co$STCOFIPS==6037]
gdta.co$RatingArea[gdta.co$STCOFIPS==6037 & substr(gdta.co$ZIP,start=1,stop=3) %in% as.character(906:912,915,917:918,935)] <- "CA15"
gdta.co$RatingArea[gdta.co$STCOFIPS==6037 & substr(gdta.co$ZIP,start=1,stop=3) %in% as.character(913:914,916,923,928,932)] <- "CA16"

n <- length(colnames(dta14151617))
cn <- colnames(dta14151617)[3:n]

idx <- which(colnames(gdta.co) %in% cn)
identical(colnames(gdta.co)[idx],cn)

idx.la.15 <- which(dta14151617$RatingAreaName=="CA15")
idx.la.16 <- which(dta14151617$RatingAreaName=="CA16")

gdta.co[gdta.co$STCOFIPS==6037 & gdta.co$RatingArea %in% c("CA15"),idx] <- dta14151617[idx.la.15,3:n]
gdta.co[gdta.co$STCOFIPS==6037 & gdta.co$RatingArea %in% c("CA16"),idx] <- dta14151617[idx.la.16,3:n]

###### how many observations in LA does this fail for?
hold <- gdta.co[gdta.co$STCOFIPS==6037 & gdta.co$RatingArea %in% c("CA15/16"),idx]
dim(hold)
#[1] 2073   64
#[1] 1428   64

### new function for calculating means with NAs
mean.na <- function(x){
	mean(x,na.rm=T)
}

### Add LA averages for people without ZIP codes
gdta.co[gdta.co$STCOFIPS==6037 & gdta.co$RatingArea %in% c("CA15/16"),idx] <- apply(dta14151617[c(idx.la.15,idx.la.16),3:n],2,mean.na)

### save merged data file
#save(gdta.co,file="data/kff-allgeocodes-12212018.Rdata")
